#load relevant libraries
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
library(ggplot2)
#Import data into a new data frame
MasterData<-read.csv("~/downloads/FRE1120 Data Summary - RAW.csv")
#Removing students that were not required to do the LS exercises
MasterData<-MasterData[MasterData$LS.Required=="y",]
#Removing the few students with a linguistic background other than English, Spanish, or Creole due to very low numbers
MasterData<-MasterData[MasterData$Linguistic.Background %in% c("English","Creole","Spanish"),]
#Creating the calculated fields of Per_aware and Per_correct
MasterData <- MasterData %>% mutate(Per_correct = Correct...aware+Correct...unaware)
MasterData <- MasterData %>% mutate(Per_aware = Correct...aware+Incorrect...aware)
#eliminating redundant columns
MasterData<- select(MasterData, "ID..","Sex","Linguistic.Background","Final.Grade","Incorrect...unaware","Per_correct","Per_aware","Time.Spent..HW.","Time.Spent..Pronunciation.Practice.","Time.Spent..LearnSmart.","Total.HW...Correct","Total.LS...Complete",)
#Changing column names
names(MasterData) <- c("ID","Sex","Language","Final_grade","IU","Per_correct","Per_aware","Time_hw","Time_pron","Time_LS","Per_correct_hw","Per_complete_LS")
#Examine scatter plots for different variable combination pairs
#Creating scatter plots for all pairs
pairs(MasterData)

#Time spent on Connect homework, Percentage of correct homework responses
ggplot(MasterData, aes(Time_hw,Per_correct_hw))+ geom_point() + stat_sum(aes(group = 1))

#Time spent on Connect homework, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(Time_hw,Time_LS))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of correct homework responses, Final course grade
ggplot(MasterData, aes(Per_correct_hw,Final_grade))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of correct LearnSmart responses, Percentage of awareness of correct/incorrect responses
ggplot(MasterData, aes(Per_correct,Per_aware))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of completion of assigned LearnSmart activities, Final course grade
ggplot(MasterData, aes(Per_complete_LS,Final_grade))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of completion of assigned LearnSmart activities, Percentage of incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(Per_complete_LS,IU))+ geom_point() + stat_sum(aes(group = 1))

#Time spent on LearnSmart adaptive activities, Final course grade
ggplot(MasterData, aes(Time_LS,Final_grade))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of incorrect and unaware LearnSmart responses, Final course grade
ggplot(MasterData, aes(IU,Final_grade))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of incorrect and unaware LearnSmart responses, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(IU,Time_LS))+ geom_point() + stat_sum(aes(group = 1))

#Percentage of incorrect and unaware LearnSmart responses, Time spent on Connect homework
ggplot(MasterData, aes(IU,Time_hw))+ geom_point() + stat_sum(aes(group = 1))

#Create new data frame for scaled data, removing non-clustering variables
ScaledData<-as.data.frame(cbind(MasterData[,1:3],scale(select(MasterData, Final_grade:Per_complete_LS))))
#Validating scaling by looking at means and standard deviations of the scaled columns
sapply(ScaledData[,4:12], mean)
Final_grade IU Per_correct Per_aware Time_hw Time_pron Time_LS
5.843135e-17 -1.133857e-16 2.872520e-16 -4.121555e-16 -3.776554e-17 5.052708e-17 8.279565e-17
Per_correct_hw Per_complete_LS
2.015534e-16 4.870925e-18
sapply(ScaledData[,4:12], sd)
Final_grade IU Per_correct Per_aware Time_hw Time_pron Time_LS
1 1 1 1 1 1 1
Per_correct_hw Per_complete_LS
1 1
#Creating the Within Group Sum of Squares function
# Plot the total within-cluster sum of squares (WSS) of k-means solutions
# for k = 1..nc (an "elbow" plot for choosing the number of clusters).
#
# data: numeric matrix or data frame, one row per observation.
# nc:   largest number of clusters to try (must be >= 1).
# seed: RNG seed; it is reset before every kmeans() call.
# Returns the numeric vector of WSS values (length nc), invisibly.
wssplot <- function(data, nc = 15, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  # k = 1: WSS is the total sum of squares around the column means.
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is empty for nc = 1, whereas 2:nc would count down (2, 1)
  # and leave wss longer than the x axis passed to plot().
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- kmeans(data, centers = i)$tot.withinss
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}
#Create the WSS Graph
wssplot(ScaledData[,4:12],nc=15,seed=1234)

#Create and examine several different possible cluster solutions
threeclusterkmeans<-kmeans(ScaledData[,4:12], 3, nstart=10)
threeclusterkmeans
K-means clustering with 3 clusters of sizes 15, 68, 46
Cluster means:
Final_grade IU Per_correct Per_aware Time_hw Time_pron Time_LS Per_correct_hw Per_complete_LS
1 -1.66048356 -0.4683417 0.4338804 0.2462526 -1.34682486 -0.38218605 -0.7236692 -2.050320950 -2.0744039
2 0.35681501 -0.5420303 0.4596331 0.5090781 0.02322455 -0.03493209 -0.2628125 0.450550769 0.2919678
3 0.01399637 0.9539823 -0.8209404 -0.8328501 0.40485007 0.17626462 0.6244845 0.002551347 0.2448315
Clustering vector:
[1] 3 2 2 2 1 3 3 3 2 1 3 1 2 1 3 2 2 2 1 2 2 3 2 3 1 1 2 2 3 2 2 2 1 2 3 2 1 3 3 1 3 3 3 3 3 3 2 2 2 2 3 3 3 2 2 3 3 3 2 1 2
[62] 2 3 2 2 3 2 2 1 2 2 2 2 2 3 3 3 2 2 3 2 2 2 3 2 2 3 2 1 3 3 3 2 2 1 2 2 2 3 2 2 2 3 3 2 2 2 2 2 2 3 1 3 2 2 3 3 2 2 2 2 2
[123] 3 3 3 2 2 2 3
Within cluster sum of squares by cluster:
[1] 108.0055 317.0107 294.2538
(between_SS / total_SS = 37.6 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
fourclusterkmeans<-kmeans(ScaledData[,4:12], 4, nstart=10)
fourclusterkmeans
K-means clustering with 4 clusters of sizes 19, 15, 37, 58
Cluster means:
Final_grade IU Per_correct Per_aware Time_hw Time_pron Time_LS Per_correct_hw Per_complete_LS
1 -0.3507149 1.1769446 -0.9840832 -1.1188646 0.87451648 1.0314872 1.5405665 -0.1017225 0.1292924
2 -1.6964014 -0.6400324 0.4608217 0.2805430 -1.33097412 -0.3821861 -0.7669705 -1.8427475 -2.1677251
3 0.5572135 -0.7840533 0.9818083 0.8836826 0.07144945 0.2390765 -0.2894854 0.5519276 0.4797214
4 0.1981501 0.2801468 -0.4231319 -0.2697582 0.01215809 -0.3915741 -0.1216422 0.1578038 0.2122350
Clustering vector:
[1] 1 4 4 4 2 4 4 4 2 2 4 2 3 2 1 3 3 3 2 4 3 4 4 4 2 2 4 3 1 3 3 4 2 4 1 3 2 1 1 2 1 1 1 1 1 4 3 4 3 3 4 4 4 4 3 1 4 4 4 2 3
[62] 3 1 4 4 4 3 4 4 3 4 3 3 3 4 1 4 4 4 4 3 4 3 4 4 4 1 4 2 4 4 4 4 3 2 3 4 3 1 3 3 3 4 4 4 4 4 4 3 3 1 2 4 4 4 4 1 3 3 3 3 3
[123] 1 4 4 4 3 3 4
Within cluster sum of squares by cluster:
[1] 162.9807 111.4874 159.6486 194.7688
(between_SS / total_SS = 45.4 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size" "iter"
[9] "ifault"
#Assign the clusters for each observation for k=3,4 to a new dataframe
Clusters<-data.frame(ScaledData, as.factor(threeclusterkmeans$cluster), as.factor(fourclusterkmeans$cluster))
names(Clusters) <- c("ID","Sex","Language","Final_grade","IU","Per_correct","Per_aware","Time_hw","Time_pron","Time_LS","Per_correct_hw","Per_complete_LS","ClusterToThree","ClusterToFour")
#Graph the different solutions - Three Clusters.
#Shapes indicate Language Background. To change which variable is marked by shape, change the name of the variable in "shape=Language"
ggplot(Clusters, aes(Time_hw,Per_correct_hw, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(Time_hw,Time_LS, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(Per_correct_hw,Final_grade, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(Per_correct,Per_aware, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(Per_complete_LS,Final_grade, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(Per_complete_LS,IU, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(Time_LS,Final_grade, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(IU,Final_grade, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(IU,Time_LS, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

ggplot(Clusters, aes(IU,Time_hw, group = factor(ClusterToThree))) + geom_point((aes(shape=Language, color = factor(ClusterToThree))))

#Graph the different solutions - Four Clusters.
#Shapes indicate Language Background. To change which variable is marked by shape, change the name of the variable in "shape=Language"
ggplot(Clusters, aes(Time_hw,Per_correct_hw, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(Time_hw,Time_LS, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(Per_correct_hw,Final_grade, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(Per_correct,Per_aware, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(Per_complete_LS,Final_grade, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(Per_complete_LS,IU, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(Time_LS,Final_grade, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(IU,Final_grade, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(IU,Time_LS, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

ggplot(Clusters, aes(IU,Time_hw, group = factor(ClusterToFour))) + geom_point((aes(size=2,shape=Language, color = factor(ClusterToFour))))

for (j in langs) {
LangsCount[j,1] <- j
for (i in 1:4){
LangsCount[j,i] <- sum(UnscaledClusters$Language==j & UnscaledClusters$ClusterToFour==i)
}
}
invalid factor level, NA generated
invalid factor level, NA generated
invalid factor level, NA generated
invalid factor level, NA generated
invalid factor level, NA generated
invalid factor level, NA generated
---
title: "R Notebook"
output: html_notebook
---
  
```{r}
# Load relevant libraries
library(dplyr)
library(ggplot2)

# Import data into a new data frame
MasterData <- read.csv("~/downloads/FRE1120 Data Summary - RAW.csv")

# Keep only students who were required to do the LS exercises.
# %in% never returns NA, so a missing LS.Required value drops the row
# instead of producing an all-NA row as `== "y"` would.
MasterData <- MasterData[MasterData$LS.Required %in% "y", ]

# Remove the few students with a linguistic background other than English,
# Spanish, or Creole due to very low numbers
MasterData <- MasterData[
  MasterData$Linguistic.Background %in% c("English", "Creole", "Spanish"),
]

# Create the calculated fields Per_correct and Per_aware
MasterData <- MasterData %>%
  mutate(
    Per_correct = Correct...aware + Correct...unaware,
    Per_aware = Correct...aware + Incorrect...aware
  )

# Keep only the analysis columns and rename them in the same step, so each
# new name is tied to its source column rather than to a position.
MasterData <- MasterData %>%
  select(
    ID = ID..,
    Sex,
    Language = Linguistic.Background,
    Final_grade = Final.Grade,
    IU = Incorrect...unaware,
    Per_correct,
    Per_aware,
    Time_hw = Time.Spent..HW.,
    Time_pron = Time.Spent..Pronunciation.Practice.,
    Time_LS = Time.Spent..LearnSmart.,
    Per_correct_hw = Total.HW...Correct,
    Per_complete_LS = Total.LS...Complete
  )

```

```{r}
# Examine scatter plots for different variable combination pairs

# Scatter-plot matrix for all pairs. pairs() stops on character columns
# (Sex and Language are read as character since R 4.0), so convert first:
# data.matrix() turns character/factor columns into integer codes.
pairs(data.matrix(MasterData))

# Each plot below overlays stat_sum() on the raw points so that coincident
# observations show up as larger markers.

# Time spent on Connect homework, Percentage of correct homework responses
ggplot(MasterData, aes(x = Time_hw, y = Per_correct_hw)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Time spent on Connect homework, Time spent on LearnSmart adaptive activities
ggplot(MasterData, aes(x = Time_hw, y = Time_LS)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of correct homework responses, Final course grade
ggplot(MasterData, aes(x = Per_correct_hw, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of correct LearnSmart responses, Percentage of awareness of
# correct/incorrect responses
ggplot(MasterData, aes(x = Per_correct, y = Per_aware)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of completion of assigned LearnSmart activities, Final course grade
ggplot(MasterData, aes(x = Per_complete_LS, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of completion of assigned LearnSmart activities, Percentage of
# incorrect and unaware LearnSmart responses
ggplot(MasterData, aes(x = Per_complete_LS, y = IU)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Time spent on LearnSmart adaptive activities, Final course grade
ggplot(MasterData, aes(x = Time_LS, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses, Final course grade
ggplot(MasterData, aes(x = IU, y = Final_grade)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses, Time spent on
# LearnSmart adaptive activities
ggplot(MasterData, aes(x = IU, y = Time_LS)) +
  geom_point() +
  stat_sum(aes(group = 1))

# Percentage of incorrect and unaware LearnSmart responses, Time spent on
# Connect homework
ggplot(MasterData, aes(x = IU, y = Time_hw)) +
  geom_point() +
  stat_sum(aes(group = 1))
```

```{r}
# Build the scaled data frame: the three identifying columns (1:3) are kept
# as they are, and every column from Final_grade through Per_complete_LS is
# standardised with scale().
ScaledData <- MasterData %>%
  select(Final_grade:Per_complete_LS) %>%
  scale() %>%
  cbind(MasterData[, 1:3], .) %>%
  as.data.frame()

# Validate the scaling: print the mean and the standard deviation of each
# scaled column (columns 4 to 12).
vapply(ScaledData[, 4:12], mean, numeric(1))
vapply(ScaledData[, 4:12], sd, numeric(1))
```

```{r}
# Plot the total within-cluster sum of squares (WSS) of k-means solutions
# for k = 1..nc (an "elbow" plot for choosing the number of clusters).
#
# data: numeric matrix or data frame, one row per observation.
# nc:   largest number of clusters to try (must be >= 1).
# seed: RNG seed; it is reset before every kmeans() call.
# Returns the numeric vector of WSS values (length nc), invisibly.
wssplot <- function(data, nc = 15, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  # k = 1: WSS is the total sum of squares around the column means.
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is empty for nc = 1, whereas 2:nc would count down (2, 1)
  # and leave wss longer than the x axis passed to plot().
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    wss[i] <- kmeans(data, centers = i)$tot.withinss
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}

# Draw the WSS graph for up to 15 clusters on the scaled variables
# (columns 4 to 12 of ScaledData)
wssplot(data = ScaledData[, 4:12], nc = 15, seed = 1234)
```
```{r}
# Fit and inspect the candidate cluster solutions (k = 3 and k = 4) on the
# scaled variables, each with 10 random starts.
# NOTE(review): no seed is set in this chunk, so the fits depend on the RNG
# state left by earlier chunks - confirm this is intended.
threeclusterkmeans <- kmeans(ScaledData[, 4:12], centers = 3, nstart = 10)
print(threeclusterkmeans)

fourclusterkmeans <- kmeans(ScaledData[, 4:12], centers = 4, nstart = 10)
print(fourclusterkmeans)
```
```{r}
# Combine the scaled data with the cluster assignment of every observation
# for k = 3 and k = 4 (stored as factors) in a new data frame
Clusters <- setNames(
  data.frame(
    ScaledData,
    as.factor(threeclusterkmeans$cluster),
    as.factor(fourclusterkmeans$cluster)
  ),
  c(
    "ID", "Sex", "Language", "Final_grade", "IU", "Per_correct", "Per_aware",
    "Time_hw", "Time_pron", "Time_LS", "Per_correct_hw", "Per_complete_LS",
    "ClusterToThree", "ClusterToFour"
  )
)
```

```{r}
# Graph the three-cluster solution: colour marks the cluster, shape marks the
# language background. To mark a different variable by shape, replace
# `Language` in "shape = Language".

ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))

ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(ClusterToThree))) +
  geom_point(aes(shape = Language, color = factor(ClusterToThree)))
```

```{r}
# Graph the four-cluster solution: colour marks the cluster, shape marks the
# language background. To mark a different variable by shape, replace
# `Language` in "shape = Language".
# The point size is a constant, so it is set outside aes(); inside aes() it
# would be treated as a mapped variable and add a meaningless size legend.

ggplot(Clusters, aes(x = Time_hw, y = Per_correct_hw, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Time_hw, y = Time_LS, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_correct_hw, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_correct, y = Per_aware, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_complete_LS, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Per_complete_LS, y = IU, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = Time_LS, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = IU, y = Final_grade, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = IU, y = Time_LS, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)

ggplot(Clusters, aes(x = IU, y = Time_hw, group = factor(ClusterToFour))) +
  geom_point(aes(shape = Language, color = factor(ClusterToFour)), size = 2)
```



```{r}
# Descriptive stats for each cluster in the 4-cluster solution

# Attach both cluster assignments (as factors) to the unscaled data.
# MasterData already carries the short column names, so only the two new
# columns need naming.
UnscaledClusters <- data.frame(
  MasterData,
  ClusterToThree = as.factor(threeclusterkmeans$cluster),
  ClusterToFour = as.factor(fourclusterkmeans$cluster)
)

# Languages to tabulate, in the row order of the result
langs <- c("English", "Spanish", "Creole")

# Count students per language (rows) and four-cluster assignment (columns).
# The cross-tabulation replaces a nested loop that first wrote the language
# name into the first count column, corrupting that column's type. Fixing
# the factor levels keeps empty language/cluster combinations as 0 counts.
LangsCount <- as.data.frame.matrix(
  table(
    factor(UnscaledClusters$Language, levels = langs),
    factor(UnscaledClusters$ClusterToFour, levels = 1:4)
  )
)
names(LangsCount) <- paste0("cluster", 1:4, "count")
```



